"""
Scrapy管道
"""
import re
from scrapy.exceptions import DropItem

class DataValidationPipeline:
    """Blank out ``email`` / ``phone`` fields that fail basic format checks.

    Items are never dropped here; a field that does not look valid is
    replaced with an empty string and the item is passed on.
    """

    # Compiled once instead of on every item. No ^/$ anchors: the pattern is
    # applied with fullmatch(), which must consume the entire string.
    _EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

    # A phone value with fewer digits than this is treated as invalid.
    _MIN_PHONE_DIGITS = 7

    def process_item(self, item, spider):
        """Validate the contact fields of *item* in place.

        Args:
            item: Mapping-like item supporting ``.get()`` and item assignment.
            spider: The spider that produced the item (unused).

        Returns:
            The same *item*, with an invalid ``email`` and/or ``phone``
            replaced by ``''``.
        """
        email = item.get('email')
        if email:
            # fullmatch() rejects a trailing newline, which match() with a
            # "$" anchor would accept. A non-string value cannot be a valid
            # address, so blank it rather than let the regex raise TypeError.
            if not isinstance(email, str) or not self._EMAIL_RE.fullmatch(email):
                item['email'] = ''

        phone = item.get('phone')
        if phone:
            # Count digits only; str() lets numeric phone values through the
            # check. The stored value is left untouched when it is accepted.
            digits = re.sub(r'[^\d]', '', str(phone))
            if len(digits) < self._MIN_PHONE_DIGITS:
                item['phone'] = ''

        return item

class DeduplicationPipeline:
    """Drop items whose (email, phone) pair has already been passed through.

    Items carrying neither an email nor a phone have nothing to compare on
    and are always passed through.
    """

    def __init__(self):
        # (email, phone) string pairs of every item let through so far.
        self.seen_items = set()

    def process_item(self, item, spider):
        """Pass *item* on unless its contact pair was seen before.

        Args:
            item: Mapping-like item supporting ``.get()``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged *item* when it is not a duplicate.

        Raises:
            DropItem: If an earlier item had the same email and phone.
        """
        # Treat a missing key, None and '' alike so that None does not end up
        # in the key as the literal text "None".
        email = str(item.get('email') or '')
        phone = str(item.get('phone') or '')

        # With no contact data every such item would share one key and all
        # but the first would be dropped as false duplicates.
        if not email and not phone:
            return item

        # A tuple keeps the two fields distinct; an underscore-joined string
        # makes ('a_', 'b') and ('a', '_b') indistinguishable.
        key = (email, phone)
        if key in self.seen_items:
            raise DropItem(f"重复数据: {email}_{phone}")

        self.seen_items.add(key)
        return item